Import Libraries

In [2]:
# Import required libraries or modules
import numpy as np
# Import required libraries or modules
import pandas as pd
# Import required libraries or modules
import matplotlib.pyplot as plt
# Import required libraries or modules
import seaborn as sns
# Import required libraries or modules
import warnings
# Filter rows based on a condition
warnings.filterwarnings('ignore')
# Generate a plot or visualize data
%matplotlib inline
In [3]:
#1)Import Data
In [4]:
# Load data into the DataFrame or variable
taxi_sep_2024=pd.read_parquet('/Users/udaykola/Downloads/yellow_tripdata_2024-09.parquet')
taxi_data=pd.concat([taxi_sep_2024])
In [5]:
taxi_data.head()
Out[5]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge Airport_fee
0 1 2024-09-01 00:05:51 2024-09-01 00:45:03 1.0 9.80 1.0 N 138 48 1 47.8 10.25 0.5 13.30 6.94 1.0 79.79 2.5 1.75
1 1 2024-09-01 00:59:35 2024-09-01 01:03:43 1.0 0.50 1.0 N 140 141 1 5.1 3.50 0.5 3.00 0.00 1.0 13.10 2.5 0.00
2 2 2024-09-01 00:25:00 2024-09-01 00:34:37 2.0 2.29 1.0 N 238 152 2 13.5 1.00 0.5 0.00 0.00 1.0 16.00 0.0 0.00
3 2 2024-09-01 00:31:00 2024-09-01 00:46:52 1.0 5.20 1.0 N 93 130 1 24.7 1.00 0.5 4.55 0.00 1.0 31.75 0.0 0.00
4 2 2024-09-01 00:11:57 2024-09-01 00:30:41 2.0 2.26 1.0 N 79 231 1 17.0 1.00 0.5 4.40 0.00 1.0 26.40 2.5 0.00
In [6]:
taxi_data.shape
Out[6]:
(3633030, 19)

2)DAta Exploration¶

In [8]:
taxi_data.columns
Out[8]:
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee'],
      dtype='object')
In [9]:
taxi_data = taxi_data[['tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID','PULocationID',
       'DOLocationID', 'payment_type',  'total_amount']]
In [10]:
taxi_data.head()
Out[10]:
tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type total_amount
0 2024-09-01 00:05:51 2024-09-01 00:45:03 1.0 9.80 1.0 138 48 1 79.79
1 2024-09-01 00:59:35 2024-09-01 01:03:43 1.0 0.50 1.0 140 141 1 13.10
2 2024-09-01 00:25:00 2024-09-01 00:34:37 2.0 2.29 1.0 238 152 2 16.00
3 2024-09-01 00:31:00 2024-09-01 00:46:52 1.0 5.20 1.0 93 130 1 31.75
4 2024-09-01 00:11:57 2024-09-01 00:30:41 2.0 2.26 1.0 79 231 1 26.40
In [11]:
taxi_data.shape
Out[11]:
(3633030, 9)
In [12]:
taxi_data.hist(figsize=(20,10),bins=60)
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [13]:
taxi_data[taxi_data['total_amount']<0].shape
Out[13]:
(56109, 9)
In [14]:
# Generate a plot or visualize data
taxi_data.reset_index().plot(kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [15]:
# Generate a plot or visualize data
taxi_data[taxi_data['total_amount'] <1000].reset_index().plot(
    kind='scatter', x='index', y='total_amount', figsize=(10, 5))
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [16]:
# Generate a plot or visualize data
taxi_data[taxi_data['total_amount']<0].reset_index().plot(kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [17]:
taxi_data[taxi_data['total_amount']<0]['payment_type'].value_counts()
Out[17]:
payment_type
4    34193
2    13540
3     8016
0      348
1       12
Name: count, dtype: int64
In [18]:
# Import required libraries or modules
import matplotlib.pyplot as plt

# Plotting the histogram
taxi_data[taxi_data['total_amount'] < 0]['trip_distance'].hist(
    bins=60, figsize=(10, 5), color='blue', alpha=0.7
)

# Adding labels and title
plt.title("Histogram of Trip Distance for Negative Total Amounts", fontsize=14)
plt.xlabel("Trip Distance", fontsize=12)
plt.ylabel("Frequency", fontsize=12)

# Generate a plot or visualize data
# Display the plot
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [19]:
taxi_data[taxi_data['total_amount']==0].shape
Out[19]:
(431, 9)
In [20]:
taxi_data[taxi_data['total_amount']>200].shape
Out[20]:
(2983, 9)
In [21]:
taxi_data[taxi_data['total_amount']>350].shape
Out[21]:
(440, 9)
In [22]:
taxi_data['total_amount'].mean()
Out[22]:
28.539097904503926
In [23]:
#3)Data Cldeaning

# Filter rows based on a condition
taxi_data_filtered=taxi_data[(taxi_data['total_amount']>0) & (taxi_data['total_amount']<350)]
# Filter rows based on a condition
taxi_data_filtered.head()
Out[23]:
tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type total_amount
0 2024-09-01 00:05:51 2024-09-01 00:45:03 1.0 9.80 1.0 138 48 1 79.79
1 2024-09-01 00:59:35 2024-09-01 01:03:43 1.0 0.50 1.0 140 141 1 13.10
2 2024-09-01 00:25:00 2024-09-01 00:34:37 2.0 2.29 1.0 238 152 2 16.00
3 2024-09-01 00:31:00 2024-09-01 00:46:52 1.0 5.20 1.0 93 130 1 31.75
4 2024-09-01 00:11:57 2024-09-01 00:30:41 2.0 2.26 1.0 79 231 1 26.40
In [24]:
taxi_data.shape
Out[24]:
(3633030, 9)
In [25]:
# Filter rows based on a condition
taxi_data_filtered.shape
Out[25]:
(3576050, 9)
In [26]:
# Filter rows based on a condition
taxi_data_filtered.isnull().sum()
Out[26]:
tpep_pickup_datetime          0
tpep_dropoff_datetime         0
passenger_count          483316
trip_distance                 0
RatecodeID               483316
PULocationID                  0
DOLocationID                  0
payment_type                  0
total_amount                  0
dtype: int64
In [27]:
# Filter rows based on a condition
taxi_data_filtered['passenger_count'].mean()
Out[27]:
1.3021578965407306
In [28]:
# Filter rows based on a condition
taxi_data_filtered['RatecodeID'].mean()
Out[28]:
2.283585009250715
In [29]:
# Filter rows based on a condition
taxi_data_filtered[taxi_data_filtered['passenger_count'].isnull()].reset_index().plot(
    kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [30]:
# Filter rows based on a condition
taxi_data_filtered[taxi_data_filtered['passenger_count'].isnull()].reset_index().plot(
    kind='scatter',x='index',y='total_amount',figsize=(10,5))
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [31]:
# Filter rows based on a condition
taxi_data_filtered=taxi_data_filtered.dropna()
# Filter rows based on a condition
taxi_data_filtered.shape
Out[31]:
(3092734, 9)
In [32]:
#4)data Preparation
# Filter rows based on a condition
taxi_data_prepared=taxi_data_filtered.copy()
In [33]:
taxi_data_prepared.dtypes
Out[33]:
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
PULocationID                      int32
DOLocationID                      int32
payment_type                      int64
total_amount                    float64
dtype: object
In [34]:
taxi_data_prepared.loc[:,'RatecodeID']=taxi_data_prepared['RatecodeID'].astype(str)
taxi_data_prepared.loc[:,'PULocationID']=taxi_data_prepared['PULocationID'].astype(str)
taxi_data_prepared.loc[:,'DOLocationID']=taxi_data_prepared['DOLocationID'].astype(str)
taxi_data_prepared.loc[:,'payment_type']=taxi_data_prepared['payment_type'].astype(str)
In [35]:
taxi_data_prepared.dtypes
Out[35]:
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                       object
PULocationID                     object
DOLocationID                     object
payment_type                     object
total_amount                    float64
dtype: object
In [36]:
taxi_data_prepared.head()
Out[36]:
tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type total_amount
0 2024-09-01 00:05:51 2024-09-01 00:45:03 1.0 9.80 1.0 138 48 1 79.79
1 2024-09-01 00:59:35 2024-09-01 01:03:43 1.0 0.50 1.0 140 141 1 13.10
2 2024-09-01 00:25:00 2024-09-01 00:34:37 2.0 2.29 1.0 238 152 2 16.00
3 2024-09-01 00:31:00 2024-09-01 00:46:52 1.0 5.20 1.0 93 130 1 31.75
4 2024-09-01 00:11:57 2024-09-01 00:30:41 2.0 2.26 1.0 79 231 1 26.40
In [37]:
taxi_data_prepared['transaction_date']=pd.to_datetime(taxi_data_prepared['tpep_pickup_datetime'].dt.date)
taxi_data_prepared['transaction_year']=taxi_data_prepared['tpep_pickup_datetime'].dt.year
taxi_data_prepared['transaction_month']=taxi_data_prepared['tpep_pickup_datetime'].dt.month
taxi_data_prepared['transaction_day']=taxi_data_prepared['tpep_pickup_datetime'].dt.day
taxi_data_prepared['transaction_hour']=taxi_data_prepared['tpep_pickup_datetime'].dt.hour
In [38]:
taxi_data_prepared.head()
Out[38]:
tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type total_amount transaction_date transaction_year transaction_month transaction_day transaction_hour
0 2024-09-01 00:05:51 2024-09-01 00:45:03 1.0 9.80 1.0 138 48 1 79.79 2024-09-01 2024 9 1 0
1 2024-09-01 00:59:35 2024-09-01 01:03:43 1.0 0.50 1.0 140 141 1 13.10 2024-09-01 2024 9 1 0
2 2024-09-01 00:25:00 2024-09-01 00:34:37 2.0 2.29 1.0 238 152 2 16.00 2024-09-01 2024 9 1 0
3 2024-09-01 00:31:00 2024-09-01 00:46:52 1.0 5.20 1.0 93 130 1 31.75 2024-09-01 2024 9 1 0
4 2024-09-01 00:11:57 2024-09-01 00:30:41 2.0 2.26 1.0 79 231 1 26.40 2024-09-01 2024 9 1 0
In [39]:
taxi_data_prepared['transaction_year'].unique()
Out[39]:
array([2024, 2008, 2009], dtype=int32)
In [40]:
taxi_data_prepared[taxi_data_prepared['transaction_year']!=2024]['payment_type'].value_counts()
Out[40]:
payment_type
1    2
2    1
Name: count, dtype: int64
In [41]:
taxi_data_prepared[taxi_data_prepared['transaction_month']!=9]['payment_type'].value_counts()
Out[41]:
payment_type
1    38
2     9
4     1
Name: count, dtype: int64
In [ ]:
 
In [42]:
taxi_data_prepared=taxi_data_prepared[taxi_data_prepared['transaction_year']==2024]
taxi_data_prepared=taxi_data_prepared[taxi_data_prepared['transaction_month']==9]
In [43]:
taxi_data_prepared.shape
Out[43]:
(3092686, 14)
In [44]:
#Noting down the categorical and numerical columns
taxi_data_prepared=taxi_data_prepared[taxi_data_prepared['passenger_count']>0]
taxi_data_prepared.shape
Out[44]:
(3062367, 14)
In [45]:
taxi_data_prepared.head()
Out[45]:
tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type total_amount transaction_date transaction_year transaction_month transaction_day transaction_hour
0 2024-09-01 00:05:51 2024-09-01 00:45:03 1.0 9.80 1.0 138 48 1 79.79 2024-09-01 2024 9 1 0
1 2024-09-01 00:59:35 2024-09-01 01:03:43 1.0 0.50 1.0 140 141 1 13.10 2024-09-01 2024 9 1 0
2 2024-09-01 00:25:00 2024-09-01 00:34:37 2.0 2.29 1.0 238 152 2 16.00 2024-09-01 2024 9 1 0
3 2024-09-01 00:31:00 2024-09-01 00:46:52 1.0 5.20 1.0 93 130 1 31.75 2024-09-01 2024 9 1 0
4 2024-09-01 00:11:57 2024-09-01 00:30:41 2.0 2.26 1.0 79 231 1 26.40 2024-09-01 2024 9 1 0
In [46]:
categorical_columns=['PULocationID','transaction_date','transaction_month','transaction_day','transaction_hour']
numerical_columns=['trip_distance','total_amount']
all_needed_columns=categorical_columns+numerical_columns
In [47]:
main_taxi_df=taxi_data_prepared[all_needed_columns]
print(main_taxi_df.shape)
main_taxi_df.head()
(3062367, 7)
Out[47]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount
0 138 2024-09-01 9 1 0 9.80 79.79
1 140 2024-09-01 9 1 0 0.50 13.10
2 238 2024-09-01 9 1 0 2.29 16.00
3 93 2024-09-01 9 1 0 5.20 31.75
4 79 2024-09-01 9 1 0 2.26 26.40
In [48]:
# Reset the index of the DataFrame to a default integer-based index
taxi_grouped_by_region=main_taxi_df.groupby(categorical_columns).mean().reset_index()
taxi_grouped_by_region['count_of_transactions'] = main_taxi_df.groupby(
# Reset the index of the DataFrame to a default integer-based index
                                                    categorical_columns).count().reset_index()['total_amount']
print(taxi_grouped_by_region.shape)
taxi_grouped_by_region.head()
#Aggregated the columns for the efficiency
(76481, 8)
Out[48]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount count_of_transactions
0 1 2024-09-01 9 1 5 0.00 116.00 1
1 1 2024-09-01 9 1 8 12.74 52.50 1
2 1 2024-09-01 9 1 10 0.04 121.00 1
3 1 2024-09-01 9 1 11 15.63 141.96 1
4 1 2024-09-01 9 1 13 0.00 81.01 1
In [49]:
#model building
data_for_benchmark_model=taxi_grouped_by_region.copy()
In [50]:
categorical_features_benchmark=['PULocationID','transaction_month','transaction_day','transaction_hour']
input_features_benchmark=categorical_features_benchmark+['trip_distance']
target_feature_benchmark='total_amount'
In [51]:
#train model and test model
In [52]:
# Import required libraries or modules
from sklearn.model_selection import train_test_split

X_bench=data_for_benchmark_model[input_features_benchmark]
y_bench=data_for_benchmark_model[target_feature_benchmark]

X_bench=pd.get_dummies(X_bench)

X_train,X_test,y_train,y_test=train_test_split(X_bench,y_bench,test_size=0.2,random_state=10)
In [53]:
X_bench.head()
Out[53]:
transaction_month transaction_day transaction_hour trip_distance PULocationID_1 PULocationID_10 PULocationID_100 PULocationID_101 PULocationID_102 PULocationID_106 ... PULocationID_9 PULocationID_90 PULocationID_91 PULocationID_92 PULocationID_93 PULocationID_94 PULocationID_95 PULocationID_96 PULocationID_97 PULocationID_98
0 9 1 5 0.00 True False False False False False ... False False False False False False False False False False
1 9 1 8 12.74 True False False False False False ... False False False False False False False False False False
2 9 1 10 0.04 True False False False False False ... False False False False False False False False False False
3 9 1 11 15.63 True False False False False False ... False False False False False False False False False False
4 9 1 13 0.00 True False False False False False ... False False False False False False False False False False

5 rows × 258 columns

In [54]:
#Fitting the model to the data
#decision tree Model
In [55]:
# Import required libraries or modules
from sklearn.tree import DecisionTreeRegressor

tree=DecisionTreeRegressor(max_depth=10,random_state=10)
tree.fit(X_train,y_train)
tree.score(X_test,y_test)
Out[55]:
0.5603395914225617
In [56]:
# Import required libraries or modules
#Performing Principal Component Analysis It ain't improving
from sklearn.decomposition import PCA

pca=PCA(n_components=200,random_state=10)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)
In [57]:
tree.fit(X_train_pca,y_train)
tree.score(X_test_pca,y_test)
Out[57]:
0.5562141491603332
In [58]:
# Import required libraries or modules
from sklearn.model_selection import cross_val_score
# Performing Cross Evaluation 
score=cross_val_score(tree,X_bench,y_bench,cv=5)
print(score)
np.average(score)
# the scores are low and it indicates , the model is not perfect
[0.4568568  0.34361443 0.42339666 0.30446022 0.41343122]
Out[58]:
0.38835186325955096
In [59]:
# Import required libraries or modules
#Performing Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor(max_depth=100)
In [60]:
model.fit(X_train,y_train)
model.score(X_test,y_test)
Out[60]:
0.6218083550457949
In [61]:
#6)feature engineering
taxi_grouped_by_region.head()
Out[61]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount count_of_transactions
0 1 2024-09-01 9 1 5 0.00 116.00 1
1 1 2024-09-01 9 1 8 12.74 52.50 1
2 1 2024-09-01 9 1 10 0.04 121.00 1
3 1 2024-09-01 9 1 11 15.63 141.96 1
4 1 2024-09-01 9 1 13 0.00 81.01 1
In [62]:
data_with_new_features=taxi_grouped_by_region.copy()
In [63]:
#Merging the Data 
data_with_new_features['transaction_week_day']=data_with_new_features['transaction_date'].dt.weekday
# Monday starts with 0 
data_with_new_features['weekend']=data_with_new_features['transaction_week_day'].apply(
                                    lambda x:True if x==5 or x==6 else False)
In [64]:
# Import required libraries or modules
from pandas.tseries.holiday import USFederalHolidayCalendar
#In federal calendar merging with the dates
cal=USFederalHolidayCalendar()
holidays=cal.holidays(start='2023',end='2025').date
data_with_new_features['is_holiday']=data_with_new_features['transaction_date'].isin(holidays)
In [65]:
data_with_new_features.head()
Out[65]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount count_of_transactions transaction_week_day weekend is_holiday
0 1 2024-09-01 9 1 5 0.00 116.00 1 6 True False
1 1 2024-09-01 9 1 8 12.74 52.50 1 6 True False
2 1 2024-09-01 9 1 10 0.04 121.00 1 6 True False
3 1 2024-09-01 9 1 11 15.63 141.96 1 6 True False
4 1 2024-09-01 9 1 13 0.00 81.01 1 6 True False
In [66]:
# Load data into the DataFrame or variable
#Combining the data to the old dataset, adding to the location zones
zone_lookup=pd.read_csv('/Users/udaykola/Downloads/taxi-zone-lookup.csv')
zone_lookup=zone_lookup[['LocationID','Borough']]
zone_lookup['LocationID']=zone_lookup['LocationID'].astype(str)
zone_lookup.head()
Out[66]:
LocationID Borough
0 1 EWR
1 2 Queens
2 3 Bronx
3 4 Manhattan
4 5 Staten Island
In [67]:
# Merge or join two DataFrames
data_with_new_features=data_with_new_features.merge(
                        zone_lookup,left_on='PULocationID',right_on='LocationID',how='left')
data_with_new_features.head()
Out[67]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount count_of_transactions transaction_week_day weekend is_holiday LocationID Borough
0 1 2024-09-01 9 1 5 0.00 116.00 1 6 True False 1 EWR
1 1 2024-09-01 9 1 8 12.74 52.50 1 6 True False 1 EWR
2 1 2024-09-01 9 1 10 0.04 121.00 1 6 True False 1 EWR
3 1 2024-09-01 9 1 11 15.63 141.96 1 6 True False 1 EWR
4 1 2024-09-01 9 1 13 0.00 81.01 1 6 True False 1 EWR
In [68]:
data_with_new_features['Borough'].value_counts()

# Import required libraries or modules
import plotly.express as px
# Import required libraries or modules
import pandas as pd

# Data
borough_data = {
    'Borough': ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Unknown', 'EWR', 'Staten Island'],
    'Pickup_Count': [41057, 14587, 14480, 4700, 1301, 273, 83]
}

# Create or manipulate a DataFrame
# Create a DataFrame
# Create or manipulate a DataFrame
df = pd.DataFrame(borough_data)

# Create the bar chart
fig = px.bar(
    df,
    x='Borough',
    y='Pickup_Count',
    title='Most Picked-Up Locations by Borough',
    labels={'Pickup_Count': 'Number of Pickups'},
    text='Pickup_Count',
    color='Pickup_Count',  # Add color to differentiate values
    color_continuous_scale='Viridis'  # Use a visually appealing color scale
)

# Customize the layout
fig.update_traces(
    textposition='outside',
    marker_line_width=1.5,
    marker_line_color='black'
)
fig.update_layout(
    xaxis_title='Borough',
    yaxis_title='Number of Pickups',
    title_x=0.5,
    title_font_size=20,
    xaxis_tickangle=-45,  # Rotate x-axis labels for better visibility
    height=600,  # Increase the height of the chart
    width=1000,  # Increase the width of the chart
    font=dict(size=14),
# Load data into the DataFrame or variable
    margin=dict(l=50, r=50, t=80, b=150),  # Adjust margins for readability
)

# Show the chart
# Display the result or DataFrame contents
fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [69]:
pip install selenium
Requirement already satisfied: selenium in /opt/anaconda3/lib/python3.12/site-packages (4.27.1)
Requirement already satisfied: urllib3<3,>=1.26 in /opt/anaconda3/lib/python3.12/site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.3)
Requirement already satisfied: trio~=0.17 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (0.27.0)
Requirement already satisfied: trio-websocket~=0.9 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (0.11.1)
Requirement already satisfied: certifi>=2021.10.8 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (2024.8.30)
Requirement already satisfied: typing_extensions~=4.9 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (4.11.0)
Requirement already satisfied: websocket-client~=1.8 in /opt/anaconda3/lib/python3.12/site-packages (from selenium) (1.8.0)
Requirement already satisfied: attrs>=23.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (24.2.0)
Requirement already satisfied: sortedcontainers in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (2.4.0)
Requirement already satisfied: idna in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (3.7)
Requirement already satisfied: outcome in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (1.3.0.post0)
Requirement already satisfied: sniffio>=1.3.0 in /opt/anaconda3/lib/python3.12/site-packages (from trio~=0.17->selenium) (1.3.0)
Requirement already satisfied: wsproto>=0.14 in /opt/anaconda3/lib/python3.12/site-packages (from trio-websocket~=0.9->selenium) (1.2.0)
Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /opt/anaconda3/lib/python3.12/site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)
Requirement already satisfied: h11<1,>=0.9.0 in /opt/anaconda3/lib/python3.12/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)
Note: you may need to restart the kernel to use updated packages.
In [70]:
# Import required libraries or modules
# from selenium import webdriver
# Import required libraries or modules
# from selenium.webdriver.common.by import By
# Import required libraries or modules
# import pandas as pd
# Import required libraries or modules
# import time

# # Set up Selenium WebDriver
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # Run in headless mode (no browser UI)
# driver = webdriver.Chrome(options=options)

# # Open the website
# url = "https://www.wunderground.com/history/monthly/us/ny/new-york-city/KLGA/date/2024-1"
# driver.get(url)
# Load data into the DataFrame or variable
# time.sleep(5)  # Wait for the page to load

# # Locate the table
# table = driver.find_element(By.CLASS_NAME, "days")  # Adjust as needed

# # Extract rows
# rows = table.find_elements(By.TAG_NAME, "tr")
# data = []
# for row in rows[1:]:  # Skip header row
#     cols = row.find_elements(By.TAG_NAME, "td")
#     if len(cols) > 1:
#         date = cols[0].text.strip()
#         avg_temp = cols[2].text.strip()  # Assuming "Avg" is the 3rd column
#         data.append([date, avg_temp])

# Create or manipulate a DataFrame
# # Create a DataFrame and save to CSV
# Create or manipulate a DataFrame
# df = pd.DataFrame(data, columns=["Date", "Average Temperature"])
# df.to_csv("average_daily_temperatures.csv", index=False)

# # Clean up
# driver.quit()
# print("Data saved to 'average_daily_temperatures.csv'.")
In [71]:
# Import required libraries or modules
import pandas as pd

# Read the CSV file with the specified encoding
# Load data into the DataFrame or variable
nyc_weather = pd.read_csv('/Users/udaykola/Downloads/sep_weather_data.csv', encoding='ISO-8859-1')

# Clean the dataset by removing the 'Ê' character from all string columns
nyc_weather = nyc_weather.applymap(lambda x: x.replace('Ê', '').strip() if isinstance(x, str) else x)

# Export the cleaned dataset to a new CSV file
# Load data into the DataFrame or variable
cleaned_file_path = '/Users/udaykola/Downloads/sep_weather_data_cleaned.csv'
nyc_weather.to_csv(cleaned_file_path, index=False, encoding='utf-8')

# Display the cleaned dataset's head
print("Cleaned data preview:")
print(nyc_weather.head())
print(f"Cleaned data has been saved to: {cleaned_file_path}")
Cleaned data preview:
         Date Temperature  Dew Point Humidity Wind speed(mph) Pressure (in)  \
0         Sep          Avg       Avg      Avg             Avg           Avg   
1  1-Sep-2024        25.22     19.89     72.6             7.6          29.9   
2  2-Sep-2024        22.61     11.11     49.2            13.2          30.0   
3  3-Sep-2024        19.33      5.56     41.8             8.3          30.3   
4  4-Sep-2024           21       8.5     45.7             6.0          30.4   

  Precipitation  
0         Total  
1          0.00  
2          0.00  
3          0.00  
4          0.00  
Cleaned data has been saved to: /Users/udaykola/Downloads/sep_weather_data_cleaned.csv
In [72]:
nyc_weather['Precipitation'].unique()
Out[72]:
array(['Total', '0.00', '0.32', '0.01', '0.07', '0.05', '0.22', '0.64'],
      dtype=object)
In [73]:
nyc_weather['Humidity'].unique()
Out[73]:
array(['Avg', '72.6', '49.2', '41.8', '45.7', '58.7', '73.5', '74.2',
       '48.6', '40.1', '45.3', '47.2', '67.2', '61.6', '66.0', '65.8',
       '61.7', '70.4', '76.3', '54.1', '62.4', '60.2', '62.2', '63.9',
       '83.4', '74.7', '83.0', '82.7', '69.7'], dtype=object)
In [74]:
# Ensure the Precipitation column is numeric
nyc_weather['Precipitation'] = pd.to_numeric(nyc_weather['Precipitation'], errors='coerce')

# Fill NaN values (if any) with 0 or another default value
nyc_weather['Precipitation'] = nyc_weather['Precipitation'].fillna(0)

# Apply the weather type logic
nyc_weather['Weather_Type'] = nyc_weather['Precipitation'].apply(
    lambda x: 'sunny' if x == 0 else ('cloudy' if x < 0.1 else 'rainy')
)

# Preview the results
print(nyc_weather[['Precipitation', 'Weather_Type']].head())
   Precipitation Weather_Type
0            0.0        sunny
1            0.0        sunny
2            0.0        sunny
3            0.0        sunny
4            0.0        sunny
In [75]:
# Inspect the unique values in the Date column
print(nyc_weather['Date'].unique())

# Replace invalid entries like "Sep" with NaN
nyc_weather['Date'] = nyc_weather['Date'].replace(r'^\D+$', None, regex=True)

# Convert to datetime and coerce invalid values to NaT
nyc_weather['Date'] = pd.to_datetime(nyc_weather['Date'], format='%d-%b-%Y', errors='coerce')

# Drop rows where Date is NaT (if needed)
nyc_weather = nyc_weather.dropna(subset=['Date'])

# Extract month and day
nyc_weather['month'] = nyc_weather['Date'].dt.month
nyc_weather['day'] = nyc_weather['Date'].dt.day

# Preview the cleaned dataset
print(nyc_weather[['Date', 'month', 'day']].head())
['Sep' '1-Sep-2024' '2-Sep-2024' '3-Sep-2024' '4-Sep-2024' '5-Sep-2024'
 '6-Sep-2024' '7-Sep-2024' '8-Sep-2024' '9-Sep-2024' '10-Sep-2024'
 '11-Sep-2024' '12-Sep-2024' '13-Sep-2024' '14-Sep-2024' '15-Sep-2024'
 '16-Sep-2024' '17-Sep-2024' '18-Sep-2024' '19-Sep-2024' '20-Sep-2024'
 '21-Sep-2024' '22-Sep-2024' '23-Sep-2024' '24-Sep-2024' '25-Sep-2024'
 '26-Sep-2024' '27-Sep-2024' '28-Sep-2024' '29-Sep-2024' '30-Sep-2024']
        Date  month  day
1 2024-09-01      9    1
2 2024-09-02      9    2
3 2024-09-03      9    3
4 2024-09-04      9    4
5 2024-09-05      9    5
In [76]:
nyc_weather.head()
Out[76]:
Date Temperature Dew Point Humidity Wind speed(mph) Pressure (in) Precipitation Weather_Type month day
1 2024-09-01 25.22 19.89 72.6 7.6 29.9 0.0 sunny 9 1
2 2024-09-02 22.61 11.11 49.2 13.2 30.0 0.0 sunny 9 2
3 2024-09-03 19.33 5.56 41.8 8.3 30.3 0.0 sunny 9 3
4 2024-09-04 21 8.5 45.7 6.0 30.4 0.0 sunny 9 4
5 2024-09-05 20.83 12.33 58.7 8.8 30.3 0.0 sunny 9 5
In [ ]:
 
In [77]:
# Rename the column to remove trailing space
nyc_weather.rename(columns={'Temperature ': 'Temperature'}, inplace=True)

# Sort the data by Date
nyc_weather = nyc_weather.sort_values(by='Date')

# Create the bar chart
fig = px.bar(
    nyc_weather,
    x='Temperature',  # Corrected column name
    y='Date',
    title='Temperature Trend Over Time (Bar Chart)',
    labels={'Temperature': 'Temperature (°C)', 'Date': 'Date'},
    orientation='h',
    color='Temperature',  # Corrected column name
    color_continuous_scale='Viridis'
)

# Customize the layout
fig.update_layout(
    title_font_size=20,
    xaxis_title='Temperature (°C)',
    yaxis_title='Date',
    height=700,
    width=1000,
    margin=dict(l=50, r=50, t=80, b=100),
    font=dict(size=14),
# Generate a plot or visualize data
    plot_bgcolor='rgba(240,240,240,1)',
    paper_bgcolor='rgba(240,240,240,1)'
)

# Show the chart
# Display the result or DataFrame contents
fig.show()
In [78]:
#Whole Temperature data After cleaning 
nyc_weather.columns = nyc_weather.columns.str.strip()
nyc_weather['Temperature']=nyc_weather['Temperature'].round(1)
nyc_weather.head()
Out[78]:
Date Temperature Dew Point Humidity Wind speed(mph) Pressure (in) Precipitation Weather_Type month day
1 2024-09-01 25.22 19.89 72.6 7.6 29.9 0.0 sunny 9 1
2 2024-09-02 22.61 11.11 49.2 13.2 30.0 0.0 sunny 9 2
3 2024-09-03 19.33 5.56 41.8 8.3 30.3 0.0 sunny 9 3
4 2024-09-04 21 8.5 45.7 6.0 30.4 0.0 sunny 9 4
5 2024-09-05 20.83 12.33 58.7 8.8 30.3 0.0 sunny 9 5
In [ ]:
 
In [79]:
# Merge or join two DataFrames--Final Dataset After merging everything
nyc_taxi_with_weather=data_with_new_features.merge(nyc_weather,left_on=['transaction_month','transaction_day'],
                                                  right_on=['month','day'],how='left')
print(nyc_taxi_with_weather.shape)
nyc_taxi_with_weather.head()
(76481, 23)
Out[79]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount count_of_transactions transaction_week_day weekend ... Date Temperature Dew Point Humidity Wind speed(mph) Pressure (in) Precipitation Weather_Type month day
0 1 2024-09-01 9 1 5 0.00 116.00 1 6 True ... 2024-09-01 25.22 19.89 72.6 7.6 29.9 0.0 sunny 9 1
1 1 2024-09-01 9 1 8 12.74 52.50 1 6 True ... 2024-09-01 25.22 19.89 72.6 7.6 29.9 0.0 sunny 9 1
2 1 2024-09-01 9 1 10 0.04 121.00 1 6 True ... 2024-09-01 25.22 19.89 72.6 7.6 29.9 0.0 sunny 9 1
3 1 2024-09-01 9 1 11 15.63 141.96 1 6 True ... 2024-09-01 25.22 19.89 72.6 7.6 29.9 0.0 sunny 9 1
4 1 2024-09-01 9 1 13 0.00 81.01 1 6 True ... 2024-09-01 25.22 19.89 72.6 7.6 29.9 0.0 sunny 9 1

5 rows × 23 columns

In [80]:
nyc_taxi_with_weather.drop(['Date','month','day','LocationID'],axis=1,inplace=True)
In [81]:
# Check for null values in the specified column
nyc_taxi_with_weather.isnull().sum()
Out[81]:
PULocationID             0
transaction_date         0
transaction_month        0
transaction_day          0
transaction_hour         0
trip_distance            0
total_amount             0
count_of_transactions    0
transaction_week_day     0
weekend                  0
is_holiday               0
Borough                  0
Temperature              0
Dew Point                0
Humidity                 0
Wind speed(mph)          0
Pressure (in)            0
Precipitation            0
Weather_Type             0
dtype: int64
In [82]:
nyc_taxi_with_weather.dropna(inplace=True)
print(nyc_taxi_with_weather.shape)
(76481, 19)
In [83]:
# Check for null values in the specified column
nyc_taxi_with_weather.isnull().sum()
Out[83]:
PULocationID             0
transaction_date         0
transaction_month        0
transaction_day          0
transaction_hour         0
trip_distance            0
total_amount             0
count_of_transactions    0
transaction_week_day     0
weekend                  0
is_holiday               0
Borough                  0
Temperature              0
Dew Point                0
Humidity                 0
Wind speed(mph)          0
Pressure (in)            0
Precipitation            0
Weather_Type             0
dtype: int64
In [84]:
nyc_taxi_with_weather.head()
Out[84]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount count_of_transactions transaction_week_day weekend is_holiday Borough Temperature Dew Point Humidity Wind speed(mph) Pressure (in) Precipitation Weather_Type
0 1 2024-09-01 9 1 5 0.00 116.00 1 6 True False EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny
1 1 2024-09-01 9 1 8 12.74 52.50 1 6 True False EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny
2 1 2024-09-01 9 1 10 0.04 121.00 1 6 True False EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny
3 1 2024-09-01 9 1 11 15.63 141.96 1 6 True False EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny
4 1 2024-09-01 9 1 13 0.00 81.01 1 6 True False EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny
In [85]:
#Final Model training
data_for_model=nyc_taxi_with_weather.copy()
In [86]:
categorical_features = ['PULocationID', 'transaction_month', 'transaction_day',
       'transaction_hour', 'transaction_week_day', 'weekend', 'is_holiday', 'Borough','Weather_Type']
input_features=categorical_features+['trip_distance','Humidity','Wind speed(mph)','Precipitation','Temperature']
target_feature='total_amount'
In [87]:
#Train_test_Split
# Import required libraries or modules
from sklearn.model_selection import train_test_split

X = data_for_model[input_features]
y = data_for_model[target_feature]

X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
In [88]:
#Decision Tree
# Import required libraries or modules
from sklearn.tree import DecisionTreeRegressor

decision_tree=DecisionTreeRegressor(max_depth=10)
decision_tree.fit(X_train,y_train)
decision_tree.score(X_test,y_test)
Out[88]:
0.5440474913409042
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [89]:
# Import required libraries or modules-Cross evaluation
from sklearn.model_selection import cross_val_score

print(np.average(cross_val_score(decision_tree,X,y,cv=4)))
0.44656519009400214
In [90]:
# Import required libraries or modules--Random Forest
from sklearn.ensemble import RandomForestRegressor

ran_for=RandomForestRegressor(max_depth=100)
ran_for.fit(X_train,y_train)
ran_for.score(X_test,y_test)
Out[90]:
0.6253728521390483
In [91]:
print(np.average(cross_val_score(ran_for,X,y,cv=4)))
0.4084399751491907
In [92]:
# Import required libraries or modules--Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

grad_boost=GradientBoostingRegressor()
grad_boost.fit(X_train,y_train)
grad_boost.score(X_test,y_test)
Out[92]:
0.6097808967547245
In [93]:
#Cross Evaluation of Gradient Boosting
score3=cross_val_score(grad_boost,X,y,cv=4)
print(score3)
print(np.average(score3))
[0.54236789 0.40341494 0.33071653 0.54390088]
0.45510006032010986
In [94]:
# Import required libraries or modules
from sklearn.metrics import mean_squared_error

print("Mean Squared Error:")
y_pred_1=decision_tree.predict(X_test)
print("Decision Tree:",np.sqrt(mean_squared_error(y_test,y_pred_1)))

y_pred_2=ran_for.predict(X_test)
print("Random Forest:",np.sqrt(mean_squared_error(y_test,y_pred_2)))

y_pred_3=grad_boost.predict(X_test)
print("Gradient Boosting:",np.sqrt(mean_squared_error(y_test,y_pred_3)))
Mean Squared Error:
Decision Tree: 15.246643336552005
Random Forest: 13.820191445538459
Gradient Boosting: 14.104857390269597
In [95]:
#Hyper parameter Tuning for accuracy increase on Random Forest
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10,20,50,100,150,200,300,500]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20, 40]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10, 20]
# Method of selecting samples for training each tree
bootstrap = [True, False]
In [96]:
#using Randomized search for faster results
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
In [97]:
# Import required libraries or modules
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10,
                                   cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

# Create or manipulate a DataFrame
performances = pd.DataFrame.from_dict(rf_random.cv_results_)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   8.6s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1200; total time= 2.7min
In [98]:
performances=performances.sort_values(by='rank_test_score')
performances
Out[98]:
mean_fit_time std_fit_time mean_score_time std_score_time param_n_estimators param_min_samples_split param_min_samples_leaf param_max_features param_max_depth param_bootstrap params split0_test_score split1_test_score split2_test_score mean_test_score std_test_score rank_test_score
0 31.079307 0.236847 1.682551 0.061543 200 10 1 sqrt 200 False {'n_estimators': 200, 'min_samples_split': 10,... 0.575766 0.589501 0.591067 0.585445 0.006874 1
9 68.634153 0.715173 2.943837 0.209944 600 10 2 sqrt 500 False {'n_estimators': 600, 'min_samples_split': 10,... 0.578348 0.586188 0.591214 0.585250 0.005294 2
5 157.843519 2.504911 6.372407 0.356554 1200 5 1 sqrt 100 False {'n_estimators': 1200, 'min_samples_split': 5,... 0.564866 0.581510 0.587448 0.577941 0.009558 3
7 82.204359 3.291071 4.174740 0.894944 1400 10 10 sqrt None True {'n_estimators': 1400, 'min_samples_split': 10... 0.548150 0.558322 0.559302 0.555258 0.005042 4
2 10.958274 0.124924 0.416695 0.006461 200 40 4 sqrt 20 False {'n_estimators': 200, 'min_samples_split': 40,... 0.504846 0.517145 0.512012 0.511334 0.005044 5
1 8.379104 0.180452 0.383341 0.018673 200 5 4 sqrt 20 True {'n_estimators': 200, 'min_samples_split': 5, ... 0.502565 0.513743 0.515682 0.510663 0.005781 6
6 27.816893 0.082167 0.773720 0.024157 1400 5 20 sqrt 10 True {'n_estimators': 1400, 'min_samples_split': 5,... 0.398235 0.404503 0.408053 0.403597 0.004059 7
3 0.043025 0.030238 0.000000 0.000000 400 40 10 auto 20 False {'n_estimators': 400, 'min_samples_split': 40,... NaN NaN NaN NaN NaN 8
4 0.013686 0.003050 0.000000 0.000000 1000 20 20 auto 50 False {'n_estimators': 1000, 'min_samples_split': 20... NaN NaN NaN NaN NaN 8
8 0.013402 0.001718 0.000000 0.000000 1800 10 2 auto 50 False {'n_estimators': 1800, 'min_samples_split': 10... NaN NaN NaN NaN NaN 8
In [99]:
#Performing Classification model
nyc_class=nyc_taxi_with_weather.copy()
nyc_class['earning_class']=nyc_class['total_amount'].apply(lambda x:'low' if x<=15 else 'high')
nyc_class['earning_class_binary']=nyc_class['earning_class'].apply(lambda x:0 if x=='low' else 1)
nyc_class.head()
Out[99]:
PULocationID transaction_date transaction_month transaction_day transaction_hour trip_distance total_amount count_of_transactions transaction_week_day weekend ... Borough Temperature Dew Point Humidity Wind speed(mph) Pressure (in) Precipitation Weather_Type earning_class earning_class_binary
0 1 2024-09-01 9 1 5 0.00 116.00 1 6 True ... EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny high 1
1 1 2024-09-01 9 1 8 12.74 52.50 1 6 True ... EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny high 1
2 1 2024-09-01 9 1 10 0.04 121.00 1 6 True ... EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny high 1
3 1 2024-09-01 9 1 11 15.63 141.96 1 6 True ... EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny high 1
4 1 2024-09-01 9 1 13 0.00 81.01 1 6 True ... EWR 25.22 19.89 72.6 7.6 29.9 0.0 sunny high 1

5 rows × 21 columns

In [100]:
#Creating a new Target Variable 
nyc_class['earning_class'].value_counts()
Out[100]:
earning_class
high    72283
low      4198
Name: count, dtype: int64
In [101]:
#Preparing for the model Building
categorical_features = ['PULocationID', 'transaction_month', 'transaction_day',
       'transaction_hour', 'transaction_week_day', 'weekend', 'is_holiday', 'Borough','Weather_Type']
input_features = categorical_features + ['trip_distance','Humidity','Wind speed(mph)','Precipitation','Temperature']
target_feature = 'earning_class_binary'
In [102]:
# Import required libraries or modules
from sklearn.model_selection import train_test_split

X_c = nyc_class[input_features]
y_c = nyc_class[target_feature]

X_c = pd.get_dummies(X_c)

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.33, random_state=10)
In [ ]:
 
In [103]:
X_c.head()
Out[103]:
transaction_month transaction_day transaction_hour transaction_week_day weekend is_holiday trip_distance Precipitation PULocationID_1 PULocationID_10 ... Temperature_21 Temperature_21.17 Temperature_21.78 Temperature_22.11 Temperature_22.39 Temperature_22.61 Temperature_22.78 Temperature_23.17 Temperature_24.06 Temperature_25.22
0 9 1 5 6 True False 0.00 0.0 True False ... False False False False False False False False False True
1 9 1 8 6 True False 12.74 0.0 True False ... False False False False False False False False False True
2 9 1 10 6 True False 0.04 0.0 True False ... False False False False False False False False False True
3 9 1 11 6 True False 15.63 0.0 True False ... False False False False False False False False False True
4 9 1 13 6 True False 0.00 0.0 True False ... False False False False False False False False False True

5 rows × 351 columns

In [104]:
# Import required libraries or modules
#random Forest regression after the Hyper Parameter Tuning
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train_c, y_train_c)
clf.score(X_test_c,y_test_c)
Out[104]:
0.9581599904909069
In [105]:
print(np.average(cross_val_score(clf,X_c,y_c,cv=4)))
0.9140827516581804
In [106]:
# Create the feature importance DataFrame
importance_df = pd.DataFrame({
    'Feature': X_train.columns,  # Replace with actual feature names
    'Importance': ran_for.feature_importances_  # Extract feature importances
}).sort_values(by='Importance', ascending=False)

# Display the top features for verification
print(importance_df.head(10))
                  Feature  Importance
6           trip_distance    0.476360
2        transaction_hour    0.072732
184      PULocationID_265    0.054567
266        Borough_Queens    0.042112
1         transaction_day    0.025521
3    transaction_week_day    0.015892
131      PULocationID_216    0.015718
8          PULocationID_1    0.010688
264           Borough_EWR    0.008706
265     Borough_Manhattan    0.006942
In [ ]:
 
In [ ]:
 
In [107]:
# Import required libraries or modules
from sklearn.model_selection import cross_val_score

# Check accuracy explicitly
accuracy = np.average(cross_val_score(clf, X_c, y_c, cv=4, scoring='accuracy'))
print(f"Cross-validated Accuracy: {accuracy}")

# Check log loss
log_loss = np.average(cross_val_score(clf, X_c, y_c, cv=4, scoring='neg_log_loss'))
print(f"Cross-validated Log Loss: {-log_loss}")  # Negate to get positive log loss
Cross-validated Accuracy: 0.9117553410137029
Cross-validated Log Loss: 0.21449803233267062
In [108]:
# Define the target variable as the binary column
target_feature = 'earning_class_binary'
y_c = nyc_class[target_feature]

# Train the classifier
# Import required libraries or modules
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train_c, y_train_c)  # Ensure y_train_c is categorical (e.g., 0 or 1)
print(f"Test Accuracy: {clf.score(X_test_c, y_test_c):.2f}")
Test Accuracy: 0.96
In [109]:
# Import required libraries or modules
from sklearn.metrics import mean_squared_error

# Step 1: Get predictions (using class labels)
y_pred = clf.predict(X_test_c)

# Step 2: Compute MSE
mse = mean_squared_error(y_test_c, y_pred)
print("Mean Squared Error:", mse)
Mean Squared Error: 0.040928721423194266
In [ ]:
 
In [110]:
# Import required libraries or modules
from sklearn.metrics import mean_absolute_error

# Get predictions
y_pred = clf.predict(X_test_c)

# Compute MAE
mae = mean_absolute_error(y_test_c, y_pred)
print("Mean Absolute Error:", mae)
Mean Absolute Error: 0.040928721423194266
In [111]:
# Import required libraries or modules
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predictions
y_pred = clf.predict(X_test_c)

# MAE
mae = mean_absolute_error(y_test_c, y_pred)
print("Mean Absolute Error:", mae)

# MSE
mse = mean_squared_error(y_test_c, y_pred)
print("Mean Squared Error:", mse)
Mean Absolute Error: 0.040928721423194266
Mean Squared Error: 0.040928721423194266
In [112]:
# Sort features by importance
top_features = importance_df.head(10)  # Display top 10 features

# Plot
plt.figure(figsize=(10, 6))
plt.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Top 10 Features Importance for Taxi Fare Prediction')
plt.gca().invert_yaxis()
plt.show()
No description has been provided for this image
In [113]:
# Import required libraries or modules
# Set threshold for feature importance
threshold = 0.01

# Import required libraries or modules
# Separate important and less important features
# Import required libraries or modules
important_features = importance_df[importance_df['Importance'] >= threshold]
# Import required libraries or modules
other_features = importance_df[importance_df['Importance'] < threshold].sum(numeric_only=True)

# Perform group-by operation or aggregation
# Add aggregated "Other Features"
# Create or manipulate a DataFrame
other_row = pd.DataFrame({'Feature': ['Other Features'], 'Importance': [other_features['Importance']]})
# Import required libraries or modules
important_features = pd.concat([important_features, other_row], ignore_index=True)

# Plot
plt.figure(figsize=(10, 6))
# Import required libraries or modules
plt.barh(important_features['Feature'], important_features['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importance with Aggregated Minor Features')
plt.gca().invert_yaxis()
# Display the result or DataFrame contents
plt.show()
No description has been provided for this image
In [ ]: